Data Input

rm(list = ls())
setwd("D:\\Google Drive\\TAMU\\Projets\\Hari\\Dissertation\\Paper 1")

library(tidyverse)
library(fBasics)
library(epiDisplay)
library(readxl)
input_data_main <- read_excel("Data/Sales Pipeline 20211011.xlsx", sheet = "report1633931123059")
dim(input_data_main)[1]
## [1] 5338

Filtering Data to use

# Keep only lost or won bids 
input_data <- input_data_main %>%dplyr::filter(Stage == "Closed Lost" |Stage == "Closed Won" ) 
dim(input_data)[1]
## [1] 2875
# Keep only bids between 2016 and 2021
data_till_2021 <-input_data %>%dplyr::filter(`Fiscal Year Sodexo` <=2021 &`Fiscal Year Sodexo` >=2016 )
dim(data_till_2021)[1]
## [1] 2476
data_till_2021 <- data_till_2021%>%drop_na(`Service Level 1`)
dim(data_till_2021)[1]
## [1] 2476
data_till_2021_new <- data_till_2021
data_till_2021_new <- data_till_2021_new%>% mutate(`Old Provider` =  
                                     case_when(`REP-Ultimate Parent Provided by Name` == "Aramark Group" ~ "Aramark", 
                                               
                                               `REP-Ultimate Parent Provided by Name` == "Compass Group" ~ "Compass",
                                               `REP-Ultimate Parent Provided by Name` == "Sodexo Group" ~ "Sodexo"))%>%
  dplyr::mutate(`Old Provider` = replace_na(`Old Provider`,"Other"))


data_till_2021_new <- data_till_2021_new%>% mutate(`New Provider` =  
                                     case_when(`REP-Service Ultimate Parent Awarded to` == "Aramark Group" ~ "Aramark", 
                                               
                                               `REP-Service Ultimate Parent Awarded to` == "Compass Group" ~ "Compass",
                                               `REP-Service Ultimate Parent Awarded to` == "Sodexo Group" ~ "Sodexo"))%>%
  dplyr::mutate(`New Provider` = replace_na(`New Provider`,"Other"))
tab1(data_till_2021_new$`Old Provider`)

## data_till_2021_new$`Old Provider` : 
##         Frequency Percent Cum. percent
## Aramark        69     2.8          2.8
## Compass       216     8.7         11.5
## Other        1468    59.3         70.8
## Sodexo        723    29.2        100.0
##   Total      2476   100.0        100.0
tab1(data_till_2021_new$`New Provider`)

## data_till_2021_new$`New Provider` : 
##         Frequency Percent Cum. percent
## Aramark        69     2.8          2.8
## Compass       222     9.0         11.8
## Other         933    37.7         49.4
## Sodexo       1252    50.6        100.0
##   Total      2476   100.0        100.0

Factorizing Data and Creating Dummies where needed

library(dummies)
# New Provider - Dummy coding 
new_provider <- as.data.frame(data_till_2021_new$`New Provider`)
colnames(new_provider)<- c("new_provider")
data_till_2021_new <- cbind(data_till_2021_new, dummy.data.frame(new_provider, sep = "_"))


# Creating Yeas<= 2016 Dummy
data_till_2021_new <- data_till_2021_new%>% mutate(`year_greater_than_2016` =  
                                                     case_when(`Fiscal Year Sodexo` <= 2016 ~ 0,  
                                                               `Fiscal Year Sodexo`  > 2016 ~ 1))
data_till_2021_new$year_greater_than_2016 <- as.factor(data_till_2021_new$year_greater_than_2016)
data_till_2021_new$year_greater_than_2016 <- relevel(data_till_2021_new$year_greater_than_2016 , ref = "0")

# Factorizing Opportunity Global Region

data_till_2021_new$`Opportunity Global Region` <- as.factor(data_till_2021_new$`Opportunity Global Region`)

data_till_2021_new$`Opportunity Global Region` <- relevel(data_till_2021_new$`Opportunity Global Region`, ref = "NORAM" )

# Factorizing SL1

data_till_2021_new$`Service Level 1` <- as.factor(data_till_2021_new$`Service Level 1`)

data_till_2021_new$`Service Level 1` <- relevel(data_till_2021_new$`Service Level 1`, ref = "Hard FM" )

# Factorizing Stategic Segment 1

data_till_2021_new$`Strategic Segment L1` <- as.factor(data_till_2021_new$`Strategic Segment L1`)

data_till_2021_new$`Strategic Segment L1` <- relevel(data_till_2021_new$`Strategic Segment L1`, ref = "Mining" )





# Factorizing Regional strategic Account

data_till_2021_new$`Regional Strategic Account` <- as.factor(data_till_2021_new$`Regional Strategic Account`)
data_till_2021_new$`Regional Strategic Account` <- relevel(data_till_2021_new$`Regional Strategic Account` , ref = "0")

# Converting Revenue (Annual) (converted) into Millions of Euros

data_till_2021_new$`Revenue (Annual) (converted) in Mill Euros`<-  data_till_2021_new$`Revenue (Annual) (converted)`/1e6


data_till_2021_new <- data_till_2021_new%>% drop_na(`Revenue (Annual) (converted) in Mill Euros`)
# Factorizing Old Provider

data_till_2021_new$`Old Provider` <- as.factor(data_till_2021_new$`Old Provider`)

data_till_2021_new$`Old Provider` <- relevel(data_till_2021_new$`Old Provider`, ref = "Other" )

# Factorizing New Provider

data_till_2021_new$`New Provider` <- as.factor(data_till_2021_new$`New Provider`)

data_till_2021_new$`New Provider` <- relevel(data_till_2021_new$`New Provider`, ref = "Other" )

Merging Strategic Emphasis variables

setwd("D:\\Google Drive\\TAMU\\Projets\\Hari\\Dissertation\\Paper 1")
Startegic_Emphasis_Data <- read_excel("Data/Startegic Emphasis Data.xlsx",sheet = "Sheet1")


library("reshape2")

Startegic_Emphasis_Data <- dcast(melt(Startegic_Emphasis_Data, id.vars=c("Fiscal Year Sodexo", "Company")), `Fiscal Year Sodexo`~variable+Company)

Startegic_Emphasis_Data_Combined <- read_excel("Data/Startegic Emphasis Data.xlsx",sheet = "Combined")
Startegic_Emphasis_Data_Combined <- dcast(melt(Startegic_Emphasis_Data_Combined, id.vars=c("Fiscal Year Sodexo", "Company")), `Fiscal Year Sodexo`~variable+Company)

Slpit Data into Train (80%) and Test (20%)

library(caret)   
library(splitTools)
library(ranger)

set.seed(12345)

set.seed(3451)
inds <- partition(data_till_2021_new$`New Provider`, p = c(train = 0.8,  test = 0.2))
#str(inds)

train_main <- data_till_2021_new[inds$train, ]
test_main <- data_till_2021_new[inds$test, ]

Analysis Without Strategic Emphasis variables

Data Setup

outcome <- "New Provider"
variables <- c("Old Provider","Opportunity Global Region","Service Level 1","Strategic Segment L1","Regional Strategic Account","Revenue (Annual) (converted) in Mill Euros","Margin (GP/BR)" )

all_variables <- c(outcome, variables)

train <- train_main %>% dplyr::select(all_of(all_variables))
test <- test_main %>%  dplyr::select(all_of(all_variables))
colnames(train) <- c("New.Provider","Old.Provider","Opportunity.Global.Region","Service.Level.1","Strategic.Segment.L1","Regional.Strategic.Account","Revenue.Mill.Euros","Margin" )
colnames(test) <- c("New.Provider","Old.Provider","Opportunity.Global.Region","Service.Level.1","Strategic.Segment.L1","Regional.Strategic.Account","Revenue.Mill.Euros","Margin" )

Random Forest

library(randomForest)
modfit.rf <- randomForest(New.Provider ~. , data=train)

GBM

## GBM
library(caret)
myControl <- trainControl(method='cv', number=5, returnResamp='none')

labelName <- 'New.Provider'
predictors <- names(train)[names(train) != labelName]


modfit.gbm <- train(train[,predictors], train[,labelName], method='gbm', trControl=myControl)

Multinomial Logit

f <- as.formula(paste(labelName, paste(predictors, collapse = " + "),sep = "~"))
modfit.mlogit  <- multinom(f, data =  train)

Naive Bayes

library(e1071)

modfit.naivebayes <- e1071::naiveBayes(New.Provider ~ ., data = train)

SVM

modfit.svm <- e1071::svm(New.Provider ~ ., data = train,probability = TRUE)

Confusion Matrices

#RF

prediction_rf_train <- predict(modfit.rf, train, type="class")
confusion_matrix_rf_train <- confusionMatrix(prediction_rf_train, train$New.Provider)
confusion_matrix_rf_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     665       6      29     84
##    Aramark     1      36       0      7
##    Compass     3       2     118      2
##    Sodexo     77      11      31    908
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8722          
##                  95% CI : (0.8567, 0.8866)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7806          
##                                           
##  Mcnemar's Test P-Value : 9.862e-10       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.8914        0.65455        0.66292        0.9071
## Specificity                0.9036        0.99584        0.99612        0.8784
## Pos Pred Value             0.8482        0.81818        0.94400        0.8841
## Neg Pred Value             0.9323        0.99019        0.96765        0.9024
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.3359        0.01818        0.05960        0.4586
## Detection Prevalence       0.3960        0.02222        0.06313        0.5187
## Balanced Accuracy          0.8975        0.82519        0.82952        0.8928
prediction_rf_test <- predict(modfit.rf, test, type="class")
confusion_matrix_rf_test <- confusionMatrix(prediction_rf_test, test$New.Provider)
confusion_matrix_rf_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     135       1      15     39
##    Aramark     0      11       4      0
##    Compass     4       0      16      3
##    Sodexo     48       2       9    209
## 
## Overall Statistics
##                                           
##                Accuracy : 0.748           
##                  95% CI : (0.7074, 0.7856)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5634          
##                                           
##  Mcnemar's Test P-Value : 0.008243        
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7219        0.78571        0.36364        0.8327
## Specificity                0.8220        0.99170        0.98451        0.7592
## Pos Pred Value             0.7105        0.73333        0.69565        0.7799
## Neg Pred Value             0.8301        0.99376        0.94080        0.8158
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2722        0.02218        0.03226        0.4214
## Detection Prevalence       0.3831        0.03024        0.04637        0.5403
## Balanced Accuracy          0.7720        0.88871        0.67407        0.7959
# GBM
train_preds <- as.data.frame(predict(object=modfit.gbm, train[,predictors]))
confusion_matrix_gbm_train <- confusionMatrix(train_preds$`predict(object = modfit.gbm, train[, predictors])`, train$New.Provider)
confusion_matrix_gbm_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     562       5      34    128
##    Aramark     3      43       2      7
##    Compass    16       1      99      7
##    Sodexo    165       6      43    859
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7894          
##                  95% CI : (0.7708, 0.8072)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6375          
##                                           
##  Mcnemar's Test P-Value : 1.132e-06       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7534        0.78182        0.55618        0.8581
## Specificity                0.8647        0.99377        0.98668        0.7814
## Pos Pred Value             0.7709        0.78182        0.80488        0.8006
## Neg Pred Value             0.8529        0.99377        0.95746        0.8434
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.2838        0.02172        0.05000        0.4338
## Detection Prevalence       0.3682        0.02778        0.06212        0.5419
## Balanced Accuracy          0.8090        0.88779        0.77143        0.8198
test_preds <- as.data.frame(predict(object=modfit.gbm, test[,predictors]))

confusion_matrix_gbm_test <- confusionMatrix(test_preds$`predict(object = modfit.gbm, test[, predictors])`, test$New.Provider)
confusion_matrix_gbm_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     127       1      10     45
##    Aramark     0      10       5      0
##    Compass     3       0      17      4
##    Sodexo     57       3      12    202
## 
## Overall Statistics
##                                          
##                Accuracy : 0.7177         
##                  95% CI : (0.6759, 0.757)
##     No Information Rate : 0.506          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5101         
##                                          
##  Mcnemar's Test P-Value : 0.005795       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.6791        0.71429        0.38636        0.8048
## Specificity                0.8188        0.98963        0.98451        0.7061
## Pos Pred Value             0.6940        0.66667        0.70833        0.7372
## Neg Pred Value             0.8083        0.99168        0.94280        0.7793
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2560        0.02016        0.03427        0.4073
## Detection Prevalence       0.3690        0.03024        0.04839        0.5524
## Balanced Accuracy          0.7490        0.85196        0.68544        0.7555
# MLogit

prediction_mlogit_train <- predict(modfit.mlogit, train, type="class")
confusion_matrix_mlogit_train <- confusionMatrix(prediction_mlogit_train, train$New.Provider)
confusion_matrix_mlogit_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     365       5      24    203
##    Aramark    10      21       4      8
##    Compass    13       2      57     25
##    Sodexo    358      27      93    765
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6101          
##                  95% CI : (0.5882, 0.6317)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3087          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.4893        0.38182        0.32022        0.7642
## Specificity                0.8120        0.98857        0.97780        0.5117
## Pos Pred Value             0.6114        0.48837        0.58763        0.6154
## Neg Pred Value             0.7245        0.98245        0.93574        0.6798
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.1843        0.01061        0.02879        0.3864
## Detection Prevalence       0.3015        0.02172        0.04899        0.6278
## Balanced Accuracy          0.6506        0.68519        0.64901        0.6380
prediction_mlogit_test <- predict(modfit.mlogit, test, type="class")
confusion_matrix_mlogit_test <- confusionMatrix(prediction_mlogit_test, test$New.Provider)
confusion_matrix_mlogit_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other      88       2       4     45
##    Aramark     0       8       3      1
##    Compass     2       0      15     11
##    Sodexo     97       4      22    194
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6149         
##                  95% CI : (0.5705, 0.658)
##     No Information Rate : 0.506          
##     P-Value [Acc > NIR] : 6.929e-07      
##                                          
##                   Kappa : 0.3187         
##                                          
##  Mcnemar's Test P-Value : 3.640e-05      
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.4706        0.57143        0.34091        0.7729
## Specificity                0.8350        0.99170        0.97124        0.4980
## Pos Pred Value             0.6331        0.66667        0.53571        0.6120
## Neg Pred Value             0.7227        0.98760        0.93803        0.6816
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.1774        0.01613        0.03024        0.3911
## Detection Prevalence       0.2802        0.02419        0.05645        0.6391
## Balanced Accuracy          0.6528        0.78156        0.65607        0.6354
# Naive Bayes

prediction_naivebayes_train <- predict(modfit.naivebayes, train, type="class")
confusion_matrix_naivebayes_train <- confusionMatrix(prediction_naivebayes_train, train$New.Provider)
confusion_matrix_naivebayes_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     344       1      17    226
##    Aramark    25      30      11     30
##    Compass    11       0      20     26
##    Sodexo    366      24     130    719
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5621          
##                  95% CI : (0.5399, 0.5841)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 2.597e-07       
##                                           
##                   Kappa : 0.2288          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.4611        0.54545        0.11236        0.7183
## Specificity                0.8023        0.96571        0.97947        0.4688
## Pos Pred Value             0.5850        0.31250        0.35088        0.5803
## Neg Pred Value             0.7112        0.98673        0.91784        0.6194
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.1737        0.01515        0.01010        0.3631
## Detection Prevalence       0.2970        0.04848        0.02879        0.6258
## Balanced Accuracy          0.6317        0.75558        0.54591        0.5936
prediction_naivebayes_test <- predict(modfit.naivebayes, test, type="class")
confusion_matrix_naivebayes_test <- confusionMatrix(prediction_naivebayes_test, test$New.Provider)
confusion_matrix_naivebayes_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other      85       0       4     41
##    Aramark     3       9       7      6
##    Compass     3       0       6      4
##    Sodexo     96       5      27    200
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6048          
##                  95% CI : (0.5603, 0.6481)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 6.144e-06       
##                                           
##                   Kappa : 0.2979          
##                                           
##  Mcnemar's Test P-Value : 6.262e-09       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.4545        0.64286        0.13636        0.7968
## Specificity                0.8544        0.96680        0.98451        0.4776
## Pos Pred Value             0.6538        0.36000        0.46154        0.6098
## Neg Pred Value             0.7213        0.98938        0.92133        0.6964
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.1714        0.01815        0.01210        0.4032
## Detection Prevalence       0.2621        0.05040        0.02621        0.6613
## Balanced Accuracy          0.6545        0.80483        0.56044        0.6372
## SVM

prediction_svm_train <- predict(modfit.svm, train, type="class")
confusion_matrix_svm_train <- confusionMatrix(prediction_svm_train, train$New.Provider)
confusion_matrix_svm_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     389       4      30    197
##    Aramark     9      20       4      7
##    Compass     7       3      36      5
##    Sodexo    341      28     108    792
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6247         
##                  95% CI : (0.603, 0.6461)
##     No Information Rate : 0.5056         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.324          
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5214        0.36364        0.20225        0.7912
## Specificity                0.8128        0.98961        0.99168        0.5128
## Pos Pred Value             0.6274        0.50000        0.70588        0.6241
## Neg Pred Value             0.7375        0.98196        0.92639        0.7060
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.1965        0.01010        0.01818        0.4000
## Detection Prevalence       0.3131        0.02020        0.02576        0.6409
## Balanced Accuracy          0.6671        0.67662        0.59696        0.6520
prediction_svm_test <- predict(modfit.svm, test, type="class")
confusion_matrix_svm_test <- confusionMatrix(prediction_svm_test, test$New.Provider)
confusion_matrix_svm_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other      95       1       9     40
##    Aramark     0       9       3      0
##    Compass     2       0       9      3
##    Sodexo     90       4      23    208
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6472          
##                  95% CI : (0.6033, 0.6893)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 1.588e-10       
##                                           
##                   Kappa : 0.3643          
##                                           
##  Mcnemar's Test P-Value : 1.812e-08       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5080        0.64286        0.20455        0.8287
## Specificity                0.8382        0.99378        0.98894        0.5224
## Pos Pred Value             0.6552        0.75000        0.64286        0.6400
## Neg Pred Value             0.7379        0.98967        0.92739        0.7485
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.1915        0.01815        0.01815        0.4194
## Detection Prevalence       0.2923        0.02419        0.02823        0.6552
## Balanced Accuracy          0.6731        0.81832        0.59674        0.6756

Building Ensemble

#RF

ens_input_rf_train_predict <- predict(modfit.rf, train, type="prob")
colnames(ens_input_rf_train_predict)<-paste(colnames(ens_input_rf_train_predict),"rf",sep="_")

#GBM
ens_input_gbm_train_predict<- predict(modfit.gbm,train, type="prob")
colnames(ens_input_gbm_train_predict)<-paste(colnames(ens_input_gbm_train_predict),"gbm",sep="_")

#Mlogit
ens_input_ml_train_predict <- predict(modfit.mlogit, train, type="prob")
colnames(ens_input_ml_train_predict)<-paste(colnames(ens_input_ml_train_predict),"ml",sep="_")

# Naive Bayes

ens_input_nb_train_predict <- predict(modfit.naivebayes, train, type="raw")
colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")

# SVM

ens_input_svm_train_predict <- predict(modfit.svm, train,decision.values = FALSE,  probability = TRUE)
ens_input_svm_train_predict<- attr(ens_input_svm_train_predict, "probabilities")
colnames(ens_input_svm_train_predict)<-paste(colnames(ens_input_svm_train_predict),"svm",sep="_")

# Ensemble with main data and predictions
ens_input <- cbind( train,ens_input_rf_train_predict,
                    ens_input_gbm_train_predict,
                    ens_input_ml_train_predict,
                    ens_input_nb_train_predict,
                    ens_input_svm_train_predict
)
modfit.rf.ens <- randomForest(New.Provider ~. , data=ens_input)

# Ensemble with predictions
New.Provider<- train$New.Provider
ens_input_new <- cbind( New.Provider,ens_input_rf_train_predict,
                        ens_input_gbm_train_predict,
                        ens_input_ml_train_predict,
                        ens_input_nb_train_predict,
                        ens_input_svm_train_predict
)
modfit.rf.ens_new <- randomForest(New.Provider ~. , data=ens_input_new)


## Functions
ens_prediction <- function(input){
  #RF
  
  ens_input_rf_predict <- predict(modfit.rf, input, type="prob")
  colnames(ens_input_rf_predict)<-paste(colnames(ens_input_rf_predict),"rf",sep="_")
  
  #GBM
  ens_input_gbm_predict<- predict(modfit.gbm,input, type="prob")
  colnames(ens_input_gbm_predict)<-paste(colnames(ens_input_gbm_predict),"gbm",sep="_")
  
  #Mlogit
  ens_input_ml_predict <- predict(modfit.mlogit, input, type="prob")
  colnames(ens_input_ml_predict)<-paste(colnames(ens_input_ml_predict),"ml",sep="_")
  
  # Naive Bayes
  
  ens_input_nb_train_predict <- predict(modfit.naivebayes, input, type="raw")
  colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")
  
  # SVM
  
  ens_input_svm_predict <- predict(modfit.svm, input,decision.values = FALSE,  probability = TRUE)
  ens_input_svm_predict<- attr(ens_input_svm_predict, "probabilities")
  colnames(ens_input_svm_predict)<-paste(colnames(ens_input_svm_predict),"svm",sep="_")
  
  ens_input <- cbind( input,ens_input_rf_predict,
                      ens_input_gbm_predict,
                      ens_input_ml_predict,
                      ens_input_nb_train_predict,
                      ens_input_svm_predict
  )
  prediction_rf_ens <- predict(modfit.rf.ens, ens_input, type="class")
  return(prediction_rf_ens)
}

ens_new_prediction <- function(input){
  #RF
  
  ens_input_rf_predict <- predict(modfit.rf, input, type="prob")
  colnames(ens_input_rf_predict)<-paste(colnames(ens_input_rf_predict),"rf",sep="_")
  
  #GBM
  ens_input_gbm_predict<- predict(modfit.gbm,input, type="prob")
  colnames(ens_input_gbm_predict)<-paste(colnames(ens_input_gbm_predict),"gbm",sep="_")
  
  #Mlogit
  ens_input_ml_predict <- predict(modfit.mlogit, input, type="prob")
  colnames(ens_input_ml_predict)<-paste(colnames(ens_input_ml_predict),"ml",sep="_")
  
  # Naive Bayes
  
  ens_input_nb_train_predict <- predict(modfit.naivebayes, input, type="raw")
  colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")
  
  # SVM
  
  ens_input_svm_predict <- predict(modfit.svm, input,decision.values = FALSE,  probability = TRUE)
  ens_input_svm_predict<- attr(ens_input_svm_predict, "probabilities")
  colnames(ens_input_svm_predict)<-paste(colnames(ens_input_svm_predict),"svm",sep="_")
  
  ens_input <- cbind( ens_input_rf_predict,
                      ens_input_gbm_predict,
                      ens_input_ml_predict,
                      ens_input_nb_train_predict,
                      ens_input_svm_predict
  )
  prediction_rf_ens <- predict(modfit.rf.ens_new, ens_input, type="class")
  return(prediction_rf_ens)
}

Ensamble Confusion Matrices

confusionMatrix(ens_prediction(train), train$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     746       0       2      3
##    Aramark     0      55       0      0
##    Compass     0       0     176      0
##    Sodexo      0       0       0    998
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9975          
##                  95% CI : (0.9941, 0.9992)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9957          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                1.0000        1.00000        0.98876        0.9970
## Specificity                0.9959        1.00000        1.00000        1.0000
## Pos Pred Value             0.9933        1.00000        1.00000        1.0000
## Neg Pred Value             1.0000        1.00000        0.99889        0.9969
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.3768        0.02778        0.08889        0.5040
## Detection Prevalence       0.3793        0.02778        0.08889        0.5040
## Balanced Accuracy          0.9980        1.00000        0.99438        0.9985
confusionMatrix(ens_prediction(test), test$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     126       2      12     45
##    Aramark     0       9       4      0
##    Compass     6       0      24      8
##    Sodexo     55       3       4    198
## 
## Overall Statistics
##                                          
##                Accuracy : 0.7198         
##                  95% CI : (0.678, 0.7589)
##     No Information Rate : 0.506          
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.5222         
##                                          
##  Mcnemar's Test P-Value : 0.03804        
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.6738        0.64286        0.54545        0.7888
## Specificity                0.8091        0.99170        0.96903        0.7469
## Pos Pred Value             0.6811        0.69231        0.63158        0.7615
## Neg Pred Value             0.8039        0.98965        0.95633        0.7754
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2540        0.01815        0.04839        0.3992
## Detection Prevalence       0.3730        0.02621        0.07661        0.5242
## Balanced Accuracy          0.7414        0.81728        0.75724        0.7679
confusionMatrix(ens_new_prediction(train), train$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     746       0       2      3
##    Aramark     0      55       0      0
##    Compass     0       0     176      0
##    Sodexo      0       0       0    998
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9975          
##                  95% CI : (0.9941, 0.9992)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9957          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                1.0000        1.00000        0.98876        0.9970
## Specificity                0.9959        1.00000        1.00000        1.0000
## Pos Pred Value             0.9933        1.00000        1.00000        1.0000
## Neg Pred Value             1.0000        1.00000        0.99889        0.9969
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.3768        0.02778        0.08889        0.5040
## Detection Prevalence       0.3793        0.02778        0.08889        0.5040
## Balanced Accuracy          0.9980        1.00000        0.99438        0.9985
confusionMatrix(ens_new_prediction(test), test$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     126       2      12     43
##    Aramark     0       9       4      0
##    Compass     6       0      25      8
##    Sodexo     55       3       3    200
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7258          
##                  95% CI : (0.6843, 0.7646)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5328          
##                                           
##  Mcnemar's Test P-Value : 0.02236         
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.6738        0.64286        0.56818        0.7968
## Specificity                0.8155        0.99170        0.96903        0.7510
## Pos Pred Value             0.6885        0.69231        0.64103        0.7663
## Neg Pred Value             0.8051        0.98965        0.95842        0.7830
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2540        0.01815        0.05040        0.4032
## Detection Prevalence       0.3690        0.02621        0.07863        0.5262
## Balanced Accuracy          0.7447        0.81728        0.76860        0.7739

Analysis With 16 Strategic Emphasis variables

Data Setup

str_emphasis_names <- colnames(Startegic_Emphasis_Data)
str_emphasis_names <-str_emphasis_names[str_emphasis_names != "Fiscal Year Sodexo"]

all_variables <- c(outcome, variables,str_emphasis_names)

train_main$`Fiscal Year Sodexo` <- as.double(train_main$`Fiscal Year Sodexo` )
train_main_new <- full_join(train_main,  Startegic_Emphasis_Data, by = "Fiscal Year Sodexo")
train_main_new <- full_join(train_main_new,  Startegic_Emphasis_Data_Combined, by = "Fiscal Year Sodexo")


test_main$`Fiscal Year Sodexo` <- as.double(test_main$`Fiscal Year Sodexo` )
test_main_new <- full_join(test_main,  Startegic_Emphasis_Data, by = "Fiscal Year Sodexo")
test_main_new <- full_join(test_main_new,  Startegic_Emphasis_Data_Combined, by = "Fiscal Year Sodexo")

outcome <- "New Provider"
variables <- c("Old Provider","Opportunity Global Region","Service Level 1","Strategic Segment L1","Regional Strategic Account","Revenue (Annual) (converted) in Mill Euros","Margin (GP/BR)" )




train <- train_main_new %>% dplyr::select(all_of(all_variables))
test <- test_main_new %>%  dplyr::select(all_of(all_variables))
colnames(train) <- c("New.Provider","Old.Provider","Opportunity.Global.Region","Service.Level.1","Strategic.Segment.L1","Regional.Strategic.Account","Revenue.Mill.Euros","Margin",str_emphasis_names )
colnames(test) <- c("New.Provider","Old.Provider","Opportunity.Global.Region","Service.Level.1","Strategic.Segment.L1","Regional.Strategic.Account","Revenue.Mill.Euros","Margin",str_emphasis_names )

Random Forest

library(randomForest)
modfit.rf <- randomForest(New.Provider ~. , data=train)

GBM

## GBM
library(caret)
myControl <- trainControl(method='cv', number=5, returnResamp='none')

labelName <- 'New.Provider'
predictors <- names(train)[names(train) != labelName]


modfit.gbm <- train(train[,predictors], train[,labelName], method='gbm', trControl=myControl)

Multinomial Logit

f <- as.formula(paste(labelName, paste(predictors, collapse = " + "),sep = "~"))
modfit.mlogit  <- multinom(f, data =  train)

Naive Bayes

library(e1071)

modfit.naivebayes <- e1071::naiveBayes(New.Provider ~ ., data = train)

SVM

modfit.svm <- e1071::svm(New.Provider ~ ., data = train,probability = TRUE)

Confusion Matrices

#RF

prediction_rf_train <- predict(modfit.rf, train, type="class")
confusion_matrix_rf_train <- confusionMatrix(prediction_rf_train, train$New.Provider)
confusion_matrix_rf_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     444       4      61    155
##    Aramark     0       9       0      0
##    Compass     0       0       0      0
##    Sodexo    302      42     117    846
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6561         
##                  95% CI : (0.6347, 0.677)
##     No Information Rate : 0.5056         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3628         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5952       0.163636         0.0000        0.8452
## Specificity                0.8217       1.000000         1.0000        0.5291
## Pos Pred Value             0.6687       1.000000            NaN        0.6473
## Neg Pred Value             0.7705       0.976662         0.9101        0.7697
## Prevalence                 0.3768       0.027778         0.0899        0.5056
## Detection Rate             0.2242       0.004545         0.0000        0.4273
## Detection Prevalence       0.3354       0.004545         0.0000        0.6601
## Balanced Accuracy          0.7084       0.581818         0.5000        0.6871
prediction_rf_test <- predict(modfit.rf, test, type="class")
confusion_matrix_rf_test <- confusionMatrix(prediction_rf_test, test$New.Provider)
confusion_matrix_rf_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     120       1      16     39
##    Aramark     0       0       0      0
##    Compass     0       0       0      0
##    Sodexo     67      13      28    212
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6694         
##                  95% CI : (0.626, 0.7106)
##     No Information Rate : 0.506          
##     P-Value [Acc > NIR] : 1.394e-13      
##                                          
##                   Kappa : 0.3874         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.6417        0.00000        0.00000        0.8446
## Specificity                0.8188        1.00000        1.00000        0.5592
## Pos Pred Value             0.6818            NaN            NaN        0.6625
## Neg Pred Value             0.7906        0.97177        0.91129        0.7784
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2419        0.00000        0.00000        0.4274
## Detection Prevalence       0.3548        0.00000        0.00000        0.6452
## Balanced Accuracy          0.7302        0.50000        0.50000        0.7019
# GBM
train_preds <- as.data.frame(predict(object=modfit.gbm, train[,predictors]))
confusion_matrix_gbm_train <- confusionMatrix(train_preds$`predict(object = modfit.gbm, train[, predictors])`, train$New.Provider)
confusion_matrix_gbm_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     584       0      22    119
##    Aramark     0      46       1      3
##    Compass    12       0     118      9
##    Sodexo    150       9      37    870
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8172         
##                  95% CI : (0.7994, 0.834)
##     No Information Rate : 0.5056         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6863         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7828        0.83636         0.6629        0.8691
## Specificity                0.8857        0.99792         0.9883        0.7998
## Pos Pred Value             0.8055        0.92000         0.8489        0.8161
## Neg Pred Value             0.8709        0.99534         0.9674        0.8567
## Prevalence                 0.3768        0.02778         0.0899        0.5056
## Detection Rate             0.2949        0.02323         0.0596        0.4394
## Detection Prevalence       0.3662        0.02525         0.0702        0.5384
## Balanced Accuracy          0.8343        0.91714         0.8256        0.8345
test_preds <- as.data.frame(predict(object=modfit.gbm, test[,predictors]))

confusion_matrix_gbm_test <- confusionMatrix(test_preds$`predict(object = modfit.gbm, test[, predictors])`, test$New.Provider)
confusion_matrix_gbm_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     139       0      10     39
##    Aramark     0      11       2      1
##    Compass     5       0      19      7
##    Sodexo     43       3      13    204
## 
## Overall Statistics
##                                           
##                Accuracy : 0.752           
##                  95% CI : (0.7116, 0.7894)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5742          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7433        0.78571        0.43182        0.8127
## Specificity                0.8414        0.99378        0.97345        0.7592
## Pos Pred Value             0.7394        0.78571        0.61290        0.7757
## Neg Pred Value             0.8442        0.99378        0.94624        0.7983
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2802        0.02218        0.03831        0.4113
## Detection Prevalence       0.3790        0.02823        0.06250        0.5302
## Balanced Accuracy          0.7924        0.88975        0.70263        0.7860
# MLogit

prediction_mlogit_train <- predict(modfit.mlogit, train, type="class")
confusion_matrix_mlogit_train <- confusionMatrix(prediction_mlogit_train, train$New.Provider)
confusion_matrix_mlogit_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     376       7      31    185
##    Aramark     8      22       4      6
##    Compass    15       5      57     30
##    Sodexo    347      21      86    780
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6237         
##                  95% CI : (0.602, 0.6451)
##     No Information Rate : 0.5056         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3347         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5040        0.40000        0.32022        0.7792
## Specificity                0.8193        0.99065        0.97225        0.5363
## Pos Pred Value             0.6277        0.55000        0.53271        0.6321
## Neg Pred Value             0.7321        0.98299        0.93540        0.7038
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.1899        0.01111        0.02879        0.3939
## Detection Prevalence       0.3025        0.02020        0.05404        0.6232
## Balanced Accuracy          0.6617        0.69532        0.64624        0.6577
prediction_mlogit_test <- predict(modfit.mlogit, test, type="class")
confusion_matrix_mlogit_test <- confusionMatrix(prediction_mlogit_test, test$New.Provider)
confusion_matrix_mlogit_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     102       2       8     39
##    Aramark     1       8       3      0
##    Compass     5       0      13     11
##    Sodexo     79       4      20    201
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6532          
##                  95% CI : (0.6095, 0.6951)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 2.588e-11       
##                                           
##                   Kappa : 0.3908          
##                                           
##  Mcnemar's Test P-Value : 0.0004802       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5455        0.57143        0.29545        0.8008
## Specificity                0.8414        0.99170        0.96460        0.5796
## Pos Pred Value             0.6755        0.66667        0.44828        0.6612
## Neg Pred Value             0.7536        0.98760        0.93362        0.7396
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2056        0.01613        0.02621        0.4052
## Detection Prevalence       0.3044        0.02419        0.05847        0.6129
## Balanced Accuracy          0.6934        0.78156        0.63003        0.6902
# Naive Bayes

prediction_naivebayes_train <- predict(modfit.naivebayes, train, type="class")
confusion_matrix_naivebayes_train <- confusionMatrix(prediction_naivebayes_train, train$New.Provider)
confusion_matrix_naivebayes_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     277       3      13    141
##    Aramark   106      37      21    196
##    Compass   156      15     113    241
##    Sodexo    207       0      31    423
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4293          
##                  95% CI : (0.4074, 0.4514)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2071          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.3713        0.67273        0.63483        0.4226
## Specificity                0.8728        0.83221        0.77137        0.7569
## Pos Pred Value             0.6382        0.10278        0.21524        0.6399
## Neg Pred Value             0.6966        0.98889        0.95533        0.5618
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.1399        0.01869        0.05707        0.2136
## Detection Prevalence       0.2192        0.18182        0.26515        0.3338
## Balanced Accuracy          0.6220        0.75247        0.70310        0.5897
prediction_naivebayes_test <- predict(modfit.naivebayes, test, type="class")
confusion_matrix_naivebayes_test <- confusionMatrix(prediction_naivebayes_test, test$New.Provider)
confusion_matrix_naivebayes_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other      77       0       3     33
##    Aramark    18       7       5     61
##    Compass    40       7      30     57
##    Sodexo     52       0       6    100
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4315          
##                  95% CI : (0.3874, 0.4763)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 0.9996          
##                                           
##                   Kappa : 0.2145          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.4118        0.50000        0.68182        0.3984
## Specificity                0.8835        0.82573        0.76991        0.7633
## Pos Pred Value             0.6814        0.07692        0.22388        0.6329
## Neg Pred Value             0.7128        0.98272        0.96133        0.5533
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.1552        0.01411        0.06048        0.2016
## Detection Prevalence       0.2278        0.18347        0.27016        0.3185
## Balanced Accuracy          0.6476        0.66286        0.72586        0.5808
## SVM

prediction_svm_train <- predict(modfit.svm, train, type="class")
confusion_matrix_svm_train <- confusionMatrix(prediction_svm_train, train$New.Provider)
confusion_matrix_svm_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     452       8      46    172
##    Aramark     0       0       0      0
##    Compass     0       0       5      0
##    Sodexo    294      47     127    829
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6495         
##                  95% CI : (0.628, 0.6705)
##     No Information Rate : 0.5056         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3504         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.6059        0.00000       0.028090        0.8282
## Specificity                0.8169        1.00000       1.000000        0.5220
## Pos Pred Value             0.6667            NaN       1.000000        0.6392
## Neg Pred Value             0.7742        0.97222       0.912405        0.7482
## Prevalence                 0.3768        0.02778       0.089899        0.5056
## Detection Rate             0.2283        0.00000       0.002525        0.4187
## Detection Prevalence       0.3424        0.00000       0.002525        0.6551
## Balanced Accuracy          0.7114        0.50000       0.514045        0.6751
prediction_svm_test <- predict(modfit.svm, test, type="class")
confusion_matrix_svm_test <- confusionMatrix(prediction_svm_test, test$New.Provider)
confusion_matrix_svm_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     111       1      11     41
##    Aramark     0       0       0      0
##    Compass     0       0       1      0
##    Sodexo     76      13      32    210
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6492          
##                  95% CI : (0.6054, 0.6912)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 8.75e-11        
##                                           
##                   Kappa : 0.3473          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5936        0.00000       0.022727        0.8367
## Specificity                0.8285        1.00000       1.000000        0.5061
## Pos Pred Value             0.6768            NaN       1.000000        0.6344
## Neg Pred Value             0.7711        0.97177       0.913131        0.7515
## Prevalence                 0.3770        0.02823       0.088710        0.5060
## Detection Rate             0.2238        0.00000       0.002016        0.4234
## Detection Prevalence       0.3306        0.00000       0.002016        0.6673
## Balanced Accuracy          0.7110        0.50000       0.511364        0.6714

Building Ensemble

#RF

ens_input_rf_train_predict <- predict(modfit.rf, train, type="prob")
colnames(ens_input_rf_train_predict)<-paste(colnames(ens_input_rf_train_predict),"rf",sep="_")

#GBM
ens_input_gbm_train_predict<- predict(modfit.gbm,train, type="prob")
colnames(ens_input_gbm_train_predict)<-paste(colnames(ens_input_gbm_train_predict),"gbm",sep="_")

#Mlogit
ens_input_ml_train_predict <- predict(modfit.mlogit, train, type="prob")
colnames(ens_input_ml_train_predict)<-paste(colnames(ens_input_ml_train_predict),"ml",sep="_")

# Naive Bayes

ens_input_nb_train_predict <- predict(modfit.naivebayes, train, type="raw")
colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")

# SVM

ens_input_svm_train_predict <- predict(modfit.svm, train,decision.values = FALSE,  probability = TRUE)
ens_input_svm_train_predict<- attr(ens_input_svm_train_predict, "probabilities")
colnames(ens_input_svm_train_predict)<-paste(colnames(ens_input_svm_train_predict),"svm",sep="_")

# Ensemble with main data and predictions
ens_input <- cbind( train,ens_input_rf_train_predict,
                    ens_input_gbm_train_predict,
                    ens_input_ml_train_predict,
                    ens_input_nb_train_predict,
                    ens_input_svm_train_predict
)
modfit.rf.ens <- randomForest(New.Provider ~. , data=ens_input)

# Ensemble with predictions
New.Provider<- train$New.Provider
ens_input_new <- cbind( New.Provider,ens_input_rf_train_predict,
                        ens_input_gbm_train_predict,
                        ens_input_ml_train_predict,
                        ens_input_nb_train_predict,
                        ens_input_svm_train_predict
)
modfit.rf.ens_new <- randomForest(New.Provider ~. , data=ens_input_new)


## Functions
ens_prediction <- function(input){
  #RF
  
  ens_input_rf_predict <- predict(modfit.rf, input, type="prob")
  colnames(ens_input_rf_predict)<-paste(colnames(ens_input_rf_predict),"rf",sep="_")
  
  #GBM
  ens_input_gbm_predict<- predict(modfit.gbm,input, type="prob")
  colnames(ens_input_gbm_predict)<-paste(colnames(ens_input_gbm_predict),"gbm",sep="_")
  
  #Mlogit
  ens_input_ml_predict <- predict(modfit.mlogit, input, type="prob")
  colnames(ens_input_ml_predict)<-paste(colnames(ens_input_ml_predict),"ml",sep="_")
  
  # Naive Bayes
  
  ens_input_nb_train_predict <- predict(modfit.naivebayes, input, type="raw")
  colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")
  
  # SVM
  
  ens_input_svm_predict <- predict(modfit.svm, input,decision.values = FALSE,  probability = TRUE)
  ens_input_svm_predict<- attr(ens_input_svm_predict, "probabilities")
  colnames(ens_input_svm_predict)<-paste(colnames(ens_input_svm_predict),"svm",sep="_")
  
  ens_input <- cbind( input,ens_input_rf_predict,
                      ens_input_gbm_predict,
                      ens_input_ml_predict,
                      ens_input_nb_train_predict,
                      ens_input_svm_predict
  )
  prediction_rf_ens <- predict(modfit.rf.ens, ens_input, type="class")
  return(prediction_rf_ens)
}

ens_new_prediction <- function(input){
  #RF
  
  ens_input_rf_predict <- predict(modfit.rf, input, type="prob")
  colnames(ens_input_rf_predict)<-paste(colnames(ens_input_rf_predict),"rf",sep="_")
  
  #GBM
  ens_input_gbm_predict<- predict(modfit.gbm,input, type="prob")
  colnames(ens_input_gbm_predict)<-paste(colnames(ens_input_gbm_predict),"gbm",sep="_")
  
  #Mlogit
  ens_input_ml_predict <- predict(modfit.mlogit, input, type="prob")
  colnames(ens_input_ml_predict)<-paste(colnames(ens_input_ml_predict),"ml",sep="_")
  
  # Naive Bayes
  
  ens_input_nb_train_predict <- predict(modfit.naivebayes, input, type="raw")
  colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")
  
  # SVM
  
  ens_input_svm_predict <- predict(modfit.svm, input,decision.values = FALSE,  probability = TRUE)
  ens_input_svm_predict<- attr(ens_input_svm_predict, "probabilities")
  colnames(ens_input_svm_predict)<-paste(colnames(ens_input_svm_predict),"svm",sep="_")
  
  ens_input <- cbind( ens_input_rf_predict,
                      ens_input_gbm_predict,
                      ens_input_ml_predict,
                      ens_input_nb_train_predict,
                      ens_input_svm_predict
  )
  prediction_rf_ens <- predict(modfit.rf.ens_new, ens_input, type="class")
  return(prediction_rf_ens)
}

Ensamble Confusion Matrices

confusionMatrix(ens_prediction(train), train$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     746       0       1      3
##    Aramark     0      55       0      0
##    Compass     0       0     177      0
##    Sodexo      0       0       0    998
## 
## Overall Statistics
##                                           
##                Accuracy : 0.998           
##                  95% CI : (0.9948, 0.9994)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9966          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                1.0000        1.00000        0.99438        0.9970
## Specificity                0.9968        1.00000        1.00000        1.0000
## Pos Pred Value             0.9947        1.00000        1.00000        1.0000
## Neg Pred Value             1.0000        1.00000        0.99945        0.9969
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.3768        0.02778        0.08939        0.5040
## Detection Prevalence       0.3788        0.02778        0.08939        0.5040
## Balanced Accuracy          0.9984        1.00000        0.99719        0.9985
confusionMatrix(ens_prediction(test), test$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     138       1       8     36
##    Aramark     0      11       0      1
##    Compass     3       0      31      5
##    Sodexo     46       2       5    209
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7843          
##                  95% CI : (0.7454, 0.8197)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6318          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7380        0.78571        0.70455        0.8327
## Specificity                0.8544        0.99793        0.98230        0.7837
## Pos Pred Value             0.7541        0.91667        0.79487        0.7977
## Neg Pred Value             0.8435        0.99380        0.97155        0.8205
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2782        0.02218        0.06250        0.4214
## Detection Prevalence       0.3690        0.02419        0.07863        0.5282
## Balanced Accuracy          0.7962        0.89182        0.84342        0.8082
confusionMatrix(ens_new_prediction(train), train$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     746       0       1      3
##    Aramark     0      55       0      0
##    Compass     0       0     177      0
##    Sodexo      0       0       0    998
## 
## Overall Statistics
##                                           
##                Accuracy : 0.998           
##                  95% CI : (0.9948, 0.9994)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9966          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                1.0000        1.00000        0.99438        0.9970
## Specificity                0.9968        1.00000        1.00000        1.0000
## Pos Pred Value             0.9947        1.00000        1.00000        1.0000
## Neg Pred Value             1.0000        1.00000        0.99945        0.9969
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.3768        0.02778        0.08939        0.5040
## Detection Prevalence       0.3788        0.02778        0.08939        0.5040
## Balanced Accuracy          0.9984        1.00000        0.99719        0.9985
confusionMatrix(ens_new_prediction(test), test$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     140       0       8     34
##    Aramark     0      11       0      1
##    Compass     4       0      30      5
##    Sodexo     43       3       6    211
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7903          
##                  95% CI : (0.7518, 0.8253)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.642           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7487        0.78571        0.68182        0.8406
## Specificity                0.8641        0.99793        0.98009        0.7878
## Pos Pred Value             0.7692        0.91667        0.76923        0.8023
## Neg Pred Value             0.8503        0.99380        0.96937        0.8283
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2823        0.02218        0.06048        0.4254
## Detection Prevalence       0.3669        0.02419        0.07863        0.5302
## Balanced Accuracy          0.8064        0.89182        0.83095        0.8142

Analysis With 8 Strategic Emphasis variables

Data Setup

str_emphasis_names <- colnames(Startegic_Emphasis_Data_Combined)
str_emphasis_names <-str_emphasis_names[str_emphasis_names != "Fiscal Year Sodexo"]

all_variables <- c(outcome, variables,str_emphasis_names)

train_main$`Fiscal Year Sodexo` <- as.double(train_main$`Fiscal Year Sodexo` )
train_main_new <- full_join(train_main,  Startegic_Emphasis_Data, by = "Fiscal Year Sodexo")
train_main_new <- full_join(train_main_new,  Startegic_Emphasis_Data_Combined, by = "Fiscal Year Sodexo")


test_main$`Fiscal Year Sodexo` <- as.double(test_main$`Fiscal Year Sodexo` )
test_main_new <- full_join(test_main,  Startegic_Emphasis_Data, by = "Fiscal Year Sodexo")
test_main_new <- full_join(test_main_new,  Startegic_Emphasis_Data_Combined, by = "Fiscal Year Sodexo")

outcome <- "New Provider"
variables <- c("Old Provider","Opportunity Global Region","Service Level 1","Strategic Segment L1","Regional Strategic Account","Revenue (Annual) (converted) in Mill Euros","Margin (GP/BR)" )




train <- train_main_new %>% dplyr::select(all_of(all_variables))
test <- test_main_new %>%  dplyr::select(all_of(all_variables))
colnames(train) <- c("New.Provider","Old.Provider","Opportunity.Global.Region","Service.Level.1","Strategic.Segment.L1","Regional.Strategic.Account","Revenue.Mill.Euros","Margin",str_emphasis_names )
colnames(test) <- c("New.Provider","Old.Provider","Opportunity.Global.Region","Service.Level.1","Strategic.Segment.L1","Regional.Strategic.Account","Revenue.Mill.Euros","Margin",str_emphasis_names )

Random Forest

library(randomForest)
modfit.rf <- randomForest(New.Provider ~. , data=train)

GBM

## GBM
library(caret)
myControl <- trainControl(method='cv', number=5, returnResamp='none')

labelName <- 'New.Provider'
predictors <- names(train)[names(train) != labelName]


modfit.gbm <- train(train[,predictors], train[,labelName], method='gbm', trControl=myControl)

Multinomial Logit

f <- as.formula(paste(labelName, paste(predictors, collapse = " + "),sep = "~"))
modfit.mlogit  <- multinom(f, data =  train)

Naive Bayes

library(e1071)

modfit.naivebayes <- e1071::naiveBayes(New.Provider ~ ., data = train)

SVM

modfit.svm <- e1071::svm(New.Provider ~ ., data = train,probability = TRUE)

Confusion Matrices

#RF

prediction_rf_train <- predict(modfit.rf, train, type="class")
confusion_matrix_rf_train <- confusionMatrix(prediction_rf_train, train$New.Provider)
confusion_matrix_rf_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     540       6      49    102
##    Aramark     0      18       0      0
##    Compass     1       0      56      0
##    Sodexo    205      31      73    899
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7641          
##                  95% CI : (0.7448, 0.7827)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5759          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7239       0.327273        0.31461        0.8981
## Specificity                0.8728       1.000000        0.99945        0.6844
## Pos Pred Value             0.7747       1.000000        0.98246        0.7442
## Neg Pred Value             0.8394       0.981142        0.93656        0.8679
## Prevalence                 0.3768       0.027778        0.08990        0.5056
## Detection Rate             0.2727       0.009091        0.02828        0.4540
## Detection Prevalence       0.3520       0.009091        0.02879        0.6101
## Balanced Accuracy          0.7983       0.663636        0.65703        0.7912
prediction_rf_test <- predict(modfit.rf, test, type="class")
confusion_matrix_rf_test <- confusionMatrix(prediction_rf_test, test$New.Provider)
confusion_matrix_rf_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     126       2      13     27
##    Aramark     0       3       0      0
##    Compass     0       1      11      1
##    Sodexo     61       8      20    223
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7319          
##                  95% CI : (0.6906, 0.7704)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5138          
##                                           
##  Mcnemar's Test P-Value : 6.338e-10       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.6738       0.214286        0.25000        0.8884
## Specificity                0.8641       1.000000        0.99558        0.6367
## Pos Pred Value             0.7500       1.000000        0.84615        0.7147
## Neg Pred Value             0.8140       0.977688        0.93168        0.8478
## Prevalence                 0.3770       0.028226        0.08871        0.5060
## Detection Rate             0.2540       0.006048        0.02218        0.4496
## Detection Prevalence       0.3387       0.006048        0.02621        0.6290
## Balanced Accuracy          0.7689       0.607143        0.62279        0.7626
# GBM
train_preds <- as.data.frame(predict(object=modfit.gbm, train[,predictors]))
confusion_matrix_gbm_train <- confusionMatrix(train_preds$`predict(object = modfit.gbm, train[, predictors])`, train$New.Provider)
confusion_matrix_gbm_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     587       1      24    107
##    Aramark     0      45       1      3
##    Compass    12       1     116     12
##    Sodexo    147       8      37    879
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8217          
##                  95% CI : (0.8041, 0.8383)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.694           
##                                           
##  Mcnemar's Test P-Value : 0.0001935       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7869        0.81818        0.65169        0.8781
## Specificity                0.8930        0.99792        0.98613        0.8039
## Pos Pred Value             0.8164        0.91837        0.82270        0.8207
## Neg Pred Value             0.8739        0.99482        0.96629        0.8658
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.2965        0.02273        0.05859        0.4439
## Detection Prevalence       0.3631        0.02475        0.07121        0.5409
## Balanced Accuracy          0.8399        0.90805        0.81891        0.8410
test_preds <- as.data.frame(predict(object=modfit.gbm, test[,predictors]))

confusion_matrix_gbm_test <- confusionMatrix(test_preds$`predict(object = modfit.gbm, test[, predictors])`, test$New.Provider)
confusion_matrix_gbm_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     136       1       9     37
##    Aramark     0      11       3      0
##    Compass     5       0      18      6
##    Sodexo     46       2      14    208
## 
## Overall Statistics
##                                           
##                Accuracy : 0.752           
##                  95% CI : (0.7116, 0.7894)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.572           
##                                           
##  Mcnemar's Test P-Value : 0.07901         
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7273        0.78571        0.40909        0.8287
## Specificity                0.8479        0.99378        0.97566        0.7469
## Pos Pred Value             0.7432        0.78571        0.62069        0.7704
## Neg Pred Value             0.8371        0.99378        0.94433        0.8097
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2742        0.02218        0.03629        0.4194
## Detection Prevalence       0.3690        0.02823        0.05847        0.5444
## Balanced Accuracy          0.7876        0.88975        0.69238        0.7878
# MLogit

prediction_mlogit_train <- predict(modfit.mlogit, train, type="class")
confusion_matrix_mlogit_train <- confusionMatrix(prediction_mlogit_train, train$New.Provider)
confusion_matrix_mlogit_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     376       7      31    185
##    Aramark     8      22       4      6
##    Compass    15       5      57     30
##    Sodexo    347      21      86    780
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6237         
##                  95% CI : (0.602, 0.6451)
##     No Information Rate : 0.5056         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3347         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5040        0.40000        0.32022        0.7792
## Specificity                0.8193        0.99065        0.97225        0.5363
## Pos Pred Value             0.6277        0.55000        0.53271        0.6321
## Neg Pred Value             0.7321        0.98299        0.93540        0.7038
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.1899        0.01111        0.02879        0.3939
## Detection Prevalence       0.3025        0.02020        0.05404        0.6232
## Balanced Accuracy          0.6617        0.69532        0.64624        0.6577
prediction_mlogit_test <- predict(modfit.mlogit, test, type="class")
confusion_matrix_mlogit_test <- confusionMatrix(prediction_mlogit_test, test$New.Provider)
confusion_matrix_mlogit_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     102       2       8     39
##    Aramark     1       8       3      0
##    Compass     5       0      13     11
##    Sodexo     79       4      20    201
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6532          
##                  95% CI : (0.6095, 0.6951)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 2.588e-11       
##                                           
##                   Kappa : 0.3908          
##                                           
##  Mcnemar's Test P-Value : 0.0004802       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5455        0.57143        0.29545        0.8008
## Specificity                0.8414        0.99170        0.96460        0.5796
## Pos Pred Value             0.6755        0.66667        0.44828        0.6612
## Neg Pred Value             0.7536        0.98760        0.93362        0.7396
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2056        0.01613        0.02621        0.4052
## Detection Prevalence       0.3044        0.02419        0.05847        0.6129
## Balanced Accuracy          0.6934        0.78156        0.63003        0.6902
# Naive Bayes

prediction_naivebayes_train <- predict(modfit.naivebayes, train, type="class")
confusion_matrix_naivebayes_train <- confusionMatrix(prediction_naivebayes_train, train$New.Provider)
confusion_matrix_naivebayes_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     364       7      39    246
##    Aramark    54      35      15    107
##    Compass    64      11      64    137
##    Sodexo    264       2      60    511
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4919          
##                  95% CI : (0.4697, 0.5142)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 0.8918          
##                                           
##                   Kappa : 0.2135          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.4879        0.63636        0.35955        0.5105
## Specificity                0.7634        0.90857        0.88235        0.6670
## Pos Pred Value             0.5549        0.16588        0.23188        0.6105
## Neg Pred Value             0.7115        0.98869        0.93310        0.5713
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.1838        0.01768        0.03232        0.2581
## Detection Prevalence       0.3313        0.10657        0.13939        0.4227
## Balanced Accuracy          0.6257        0.77247        0.62095        0.5887
prediction_naivebayes_test <- predict(modfit.naivebayes, test, type="class")
confusion_matrix_naivebayes_test <- confusionMatrix(prediction_naivebayes_test, test$New.Provider)
confusion_matrix_naivebayes_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     100       0      10     49
##    Aramark     9       5       3     35
##    Compass    17       7      20     39
##    Sodexo     61       2      11    128
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5101          
##                  95% CI : (0.4651, 0.5549)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 0.4465          
##                                           
##                   Kappa : 0.2523          
##                                           
##  Mcnemar's Test P-Value : 7.755e-11       
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5348        0.35714        0.45455        0.5100
## Specificity                0.8091        0.90249        0.86062        0.6980
## Pos Pred Value             0.6289        0.09615        0.24096        0.6337
## Neg Pred Value             0.7418        0.97973        0.94189        0.5816
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2016        0.01008        0.04032        0.2581
## Detection Prevalence       0.3206        0.10484        0.16734        0.4073
## Balanced Accuracy          0.6719        0.62982        0.65758        0.6040
## SVM

prediction_svm_train <- predict(modfit.svm, train, type="class")
confusion_matrix_svm_train <- confusionMatrix(prediction_svm_train, train$New.Provider)
confusion_matrix_svm_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     404       8      44    192
##    Aramark     0       0       0      0
##    Compass     6       0      26      5
##    Sodexo    336      47     108    804
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6232          
##                  95% CI : (0.6015, 0.6446)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3079          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5416        0.00000        0.14607        0.8032
## Specificity                0.8023        1.00000        0.99390        0.4985
## Pos Pred Value             0.6235            NaN        0.70270        0.6208
## Neg Pred Value             0.7432        0.97222        0.92177        0.7124
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.2040        0.00000        0.01313        0.4061
## Detection Prevalence       0.3273        0.00000        0.01869        0.6540
## Balanced Accuracy          0.6719        0.50000        0.56998        0.6508
prediction_svm_test <- predict(modfit.svm, test, type="class")
confusion_matrix_svm_test <- confusionMatrix(prediction_svm_test, test$New.Provider)
confusion_matrix_svm_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     112       1      13     38
##    Aramark     0       0       0      0
##    Compass     0       0       4      1
##    Sodexo     75      13      27    212
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6613          
##                  95% CI : (0.6178, 0.7029)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : 2.038e-12       
##                                           
##                   Kappa : 0.3737          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.5989        0.00000       0.090909        0.8446
## Specificity                0.8317        1.00000       0.997788        0.5306
## Pos Pred Value             0.6829            NaN       0.800000        0.6483
## Neg Pred Value             0.7741        0.97177       0.918534        0.7692
## Prevalence                 0.3770        0.02823       0.088710        0.5060
## Detection Rate             0.2258        0.00000       0.008065        0.4274
## Detection Prevalence       0.3306        0.00000       0.010081        0.6593
## Balanced Accuracy          0.7153        0.50000       0.544348        0.6876

Building Ensemble

#RF

ens_input_rf_train_predict <- predict(modfit.rf, train, type="prob")
colnames(ens_input_rf_train_predict)<-paste(colnames(ens_input_rf_train_predict),"rf",sep="_")

#GBM
ens_input_gbm_train_predict<- predict(modfit.gbm,train, type="prob")
colnames(ens_input_gbm_train_predict)<-paste(colnames(ens_input_gbm_train_predict),"gbm",sep="_")

#Mlogit
ens_input_ml_train_predict <- predict(modfit.mlogit, train, type="prob")
colnames(ens_input_ml_train_predict)<-paste(colnames(ens_input_ml_train_predict),"ml",sep="_")

# Naive Bayes

ens_input_nb_train_predict <- predict(modfit.naivebayes, train, type="raw")
colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")

# SVM

ens_input_svm_train_predict <- predict(modfit.svm, train,decision.values = FALSE,  probability = TRUE)
ens_input_svm_train_predict<- attr(ens_input_svm_train_predict, "probabilities")
colnames(ens_input_svm_train_predict)<-paste(colnames(ens_input_svm_train_predict),"svm",sep="_")

# Ensemble with main data and predictions
ens_input <- cbind( train,ens_input_rf_train_predict,
                    ens_input_gbm_train_predict,
                    ens_input_ml_train_predict,
                    ens_input_nb_train_predict,
                    ens_input_svm_train_predict
)
modfit.rf.ens <- randomForest(New.Provider ~. , data=ens_input)

# Ensemble with predictions
New.Provider<- train$New.Provider
ens_input_new <- cbind( New.Provider,ens_input_rf_train_predict,
                        ens_input_gbm_train_predict,
                        ens_input_ml_train_predict,
                        ens_input_nb_train_predict,
                        ens_input_svm_train_predict
)
modfit.rf.ens_new <- randomForest(New.Provider ~. , data=ens_input_new)


## Functions
ens_prediction <- function(input){
  #RF
  
  ens_input_rf_predict <- predict(modfit.rf, input, type="prob")
  colnames(ens_input_rf_predict)<-paste(colnames(ens_input_rf_predict),"rf",sep="_")
  
  #GBM
  ens_input_gbm_predict<- predict(modfit.gbm,input, type="prob")
  colnames(ens_input_gbm_predict)<-paste(colnames(ens_input_gbm_predict),"gbm",sep="_")
  
  #Mlogit
  ens_input_ml_predict <- predict(modfit.mlogit, input, type="prob")
  colnames(ens_input_ml_predict)<-paste(colnames(ens_input_ml_predict),"ml",sep="_")
  
  # Naive Bayes
  
  ens_input_nb_train_predict <- predict(modfit.naivebayes, input, type="raw")
  colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")
  
  # SVM
  
  ens_input_svm_predict <- predict(modfit.svm, input,decision.values = FALSE,  probability = TRUE)
  ens_input_svm_predict<- attr(ens_input_svm_predict, "probabilities")
  colnames(ens_input_svm_predict)<-paste(colnames(ens_input_svm_predict),"svm",sep="_")
  
  ens_input <- cbind( input,ens_input_rf_predict,
                      ens_input_gbm_predict,
                      ens_input_ml_predict,
                      ens_input_nb_train_predict,
                      ens_input_svm_predict
  )
  prediction_rf_ens <- predict(modfit.rf.ens, ens_input, type="class")
  return(prediction_rf_ens)
}

ens_new_prediction <- function(input){
  #RF
  
  ens_input_rf_predict <- predict(modfit.rf, input, type="prob")
  colnames(ens_input_rf_predict)<-paste(colnames(ens_input_rf_predict),"rf",sep="_")
  
  #GBM
  ens_input_gbm_predict<- predict(modfit.gbm,input, type="prob")
  colnames(ens_input_gbm_predict)<-paste(colnames(ens_input_gbm_predict),"gbm",sep="_")
  
  #Mlogit
  ens_input_ml_predict <- predict(modfit.mlogit, input, type="prob")
  colnames(ens_input_ml_predict)<-paste(colnames(ens_input_ml_predict),"ml",sep="_")
  
  # Naive Bayes
  
  ens_input_nb_train_predict <- predict(modfit.naivebayes, input, type="raw")
  colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")
  
  # SVM
  
  ens_input_svm_predict <- predict(modfit.svm, input,decision.values = FALSE,  probability = TRUE)
  ens_input_svm_predict<- attr(ens_input_svm_predict, "probabilities")
  colnames(ens_input_svm_predict)<-paste(colnames(ens_input_svm_predict),"svm",sep="_")
  
  ens_input <- cbind( ens_input_rf_predict,
                      ens_input_gbm_predict,
                      ens_input_ml_predict,
                      ens_input_nb_train_predict,
                      ens_input_svm_predict
  )
  prediction_rf_ens <- predict(modfit.rf.ens_new, ens_input, type="class")
  return(prediction_rf_ens)
}

Ensamble Confusion Matrices

confusionMatrix(ens_prediction(train), train$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     746       0       1      3
##    Aramark     0      55       0      0
##    Compass     0       0     177      0
##    Sodexo      0       0       0    998
## 
## Overall Statistics
##                                           
##                Accuracy : 0.998           
##                  95% CI : (0.9948, 0.9994)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9966          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                1.0000        1.00000        0.99438        0.9970
## Specificity                0.9968        1.00000        1.00000        1.0000
## Pos Pred Value             0.9947        1.00000        1.00000        1.0000
## Neg Pred Value             1.0000        1.00000        0.99945        0.9969
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.3768        0.02778        0.08939        0.5040
## Detection Prevalence       0.3788        0.02778        0.08939        0.5040
## Balanced Accuracy          0.9984        1.00000        0.99719        0.9985
confusionMatrix(ens_prediction(test), test$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     146       1       8     37
##    Aramark     0      11       0      0
##    Compass     4       0      31      5
##    Sodexo     37       2       5    209
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8004          
##                  95% CI : (0.7625, 0.8347)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6606          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7807        0.78571        0.70455        0.8327
## Specificity                0.8511        1.00000        0.98009        0.8204
## Pos Pred Value             0.7604        1.00000        0.77500        0.8261
## Neg Pred Value             0.8651        0.99381        0.97149        0.8272
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2944        0.02218        0.06250        0.4214
## Detection Prevalence       0.3871        0.02218        0.08065        0.5101
## Balanced Accuracy          0.8159        0.89286        0.84232        0.8265
confusionMatrix(ens_new_prediction(train), train$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     746       0       1      3
##    Aramark     0      55       0      0
##    Compass     0       0     177      0
##    Sodexo      0       0       0    998
## 
## Overall Statistics
##                                           
##                Accuracy : 0.998           
##                  95% CI : (0.9948, 0.9994)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9966          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                1.0000        1.00000        0.99438        0.9970
## Specificity                0.9968        1.00000        1.00000        1.0000
## Pos Pred Value             0.9947        1.00000        1.00000        1.0000
## Neg Pred Value             1.0000        1.00000        0.99945        0.9969
## Prevalence                 0.3768        0.02778        0.08990        0.5056
## Detection Rate             0.3768        0.02778        0.08939        0.5040
## Detection Prevalence       0.3788        0.02778        0.08939        0.5040
## Balanced Accuracy          0.9984        1.00000        0.99719        0.9985
confusionMatrix(ens_new_prediction(test), test$New.Provider)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Aramark Compass Sodexo
##    Other     145       1       9     32
##    Aramark     0      10       0      2
##    Compass     4       1      30      5
##    Sodexo     38       2       5    212
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8004          
##                  95% CI : (0.7625, 0.8347)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6604          
##                                           
##  Mcnemar's Test P-Value : 0.6177          
## 
## Statistics by Class:
## 
##                      Class: Other Class: Aramark Class: Compass Class: Sodexo
## Sensitivity                0.7754        0.71429        0.68182        0.8446
## Specificity                0.8641        0.99585        0.97788        0.8163
## Pos Pred Value             0.7754        0.83333        0.75000        0.8249
## Neg Pred Value             0.8641        0.99174        0.96930        0.8368
## Prevalence                 0.3770        0.02823        0.08871        0.5060
## Detection Rate             0.2923        0.02016        0.06048        0.4274
## Detection Prevalence       0.3770        0.02419        0.08065        0.5181
## Balanced Accuracy          0.8197        0.85507        0.82985        0.8305

Ensemble Model for Probabilities

# Ensemble with predictions for Probability 
New.Provider<- train$New.Provider
ens_input_new <- cbind( New.Provider,ens_input_rf_train_predict,
                        ens_input_gbm_train_predict,
                        ens_input_ml_train_predict,
                        ens_input_nb_train_predict,
                        ens_input_svm_train_predict
)
modfit.rf.ens_new_prob <- randomForest(New.Provider ~. , data=ens_input_new,  probability = TRUE)


ens_new_prob_prediction <- function(input){
  #RF
  
  ens_input_rf_predict <- predict(modfit.rf, input, type="prob")
  colnames(ens_input_rf_predict)<-paste(colnames(ens_input_rf_predict),"rf",sep="_")
  
  #GBM
  ens_input_gbm_predict<- predict(modfit.gbm,input, type="prob")
  colnames(ens_input_gbm_predict)<-paste(colnames(ens_input_gbm_predict),"gbm",sep="_")
  
  #Mlogit
  ens_input_ml_predict <- predict(modfit.mlogit, input, type="prob")
  colnames(ens_input_ml_predict)<-paste(colnames(ens_input_ml_predict),"ml",sep="_")
  
  # Naive Bayes
  
  ens_input_nb_train_predict <- predict(modfit.naivebayes, input, type="raw")
  colnames(ens_input_nb_train_predict)<-paste(colnames(ens_input_nb_train_predict),"nb",sep="_")
  
  # SVM
  
  ens_input_svm_predict <- predict(modfit.svm, input,decision.values = FALSE,  probability = TRUE)
  ens_input_svm_predict<- attr(ens_input_svm_predict, "probabilities")
  colnames(ens_input_svm_predict)<-paste(colnames(ens_input_svm_predict),"svm",sep="_")
  
  ens_input <- cbind( ens_input_rf_predict,
                      ens_input_gbm_predict,
                      ens_input_ml_predict,
                      ens_input_nb_train_predict,
                      ens_input_svm_predict
  )
  prediction_rf_ens <- predict(modfit.rf.ens_new_prob, ens_input,type="prob")
  return(prediction_rf_ens)
}

Simulate Data for Mean Comparisions

Check which are factor variables

fdata <- rbind( train, test)
sapply(fdata,is.factor)
##               New.Provider               Old.Provider 
##                       TRUE                       TRUE 
##  Opportunity.Global.Region            Service.Level.1 
##                       TRUE                       TRUE 
##       Strategic.Segment.L1 Regional.Strategic.Account 
##                       TRUE                       TRUE 
##         Revenue.Mill.Euros                     Margin 
##                      FALSE                      FALSE 
##                isb_Aramark                isb_Compass 
##                      FALSE                      FALSE 
##                 isb_Sodexo                psq_Aramark 
##                      FALSE                      FALSE 
##                psq_Compass                 psq_Sodexo 
##                      FALSE                      FALSE 
##                 pb_Aramark                 pb_Compass 
##                      FALSE                      FALSE 
##                  pb_Sodexo                 pm_Aramark 
##                      FALSE                      FALSE 
##                 pm_Compass                  pm_Sodexo 
##                      FALSE                      FALSE 
##                com_Aramark                com_Compass 
##                      FALSE                      FALSE 
##                 com_Sodexo                saf_Aramark 
##                      FALSE                      FALSE 
##                saf_Compass                 saf_Sodexo 
##                      FALSE                      FALSE 
##                sus_Aramark                sus_Compass 
##                      FALSE                      FALSE 
##                 sus_Sodexo                oss_Aramark 
##                      FALSE                      FALSE 
##                oss_Compass                 oss_Sodexo 
##                      FALSE                      FALSE

Simulate Data

simulated_data<- expand.grid(
   Old.Provider                = unique(fdata$Old.Provider),
   Opportunity.Global.Region   = unique(fdata$Opportunity.Global.Region),
   Service.Level.1             = unique(fdata$Service.Level.1),
   Strategic.Segment.L1        = unique(fdata$Strategic.Segment.L1),
   Regional.Strategic.Account  = unique(fdata$Regional.Strategic.Account)
)
is.fact <- sapply(fdata, is.factor)
not.factors.df <- fdata[, !is.fact]
means <- as.data.frame(t(colMeans(not.factors.df)))
simulated_data <- cbind(simulated_data,means )
head(simulated_data)
##   Old.Provider Opportunity.Global.Region Service.Level.1 Strategic.Segment.L1
## 1      Compass                     NORAM         Soft FM               Mining
## 2        Other                     NORAM         Soft FM               Mining
## 3       Sodexo                     NORAM         Soft FM               Mining
## 4      Aramark                     NORAM         Soft FM               Mining
## 5      Compass                    BRAZIL         Soft FM               Mining
## 6        Other                    BRAZIL         Soft FM               Mining
##   Regional.Strategic.Account Revenue.Mill.Euros   Margin isb_Aramark
## 1                          0           1.358793 14.44377    10.00878
## 2                          0           1.358793 14.44377    10.00878
## 3                          0           1.358793 14.44377    10.00878
## 4                          0           1.358793 14.44377    10.00878
## 5                          0           1.358793 14.44377    10.00878
## 6                          0           1.358793 14.44377    10.00878
##   isb_Compass isb_Sodexo psq_Aramark psq_Compass psq_Sodexo pb_Aramark
## 1    2.326788    1.49172    20.46673    11.88457   29.28386   12.92972
## 2    2.326788    1.49172    20.46673    11.88457   29.28386   12.92972
## 3    2.326788    1.49172    20.46673    11.88457   29.28386   12.92972
## 4    2.326788    1.49172    20.46673    11.88457   29.28386   12.92972
## 5    2.326788    1.49172    20.46673    11.88457   29.28386   12.92972
## 6    2.326788    1.49172    20.46673    11.88457   29.28386   12.92972
##   pb_Compass pb_Sodexo pm_Aramark pm_Compass pm_Sodexo com_Aramark com_Compass
## 1   1.052027 0.1134526   1.621887   7.247475  8.539377    2.401373     3.02877
## 2   1.052027 0.1134526   1.621887   7.247475  8.539377    2.401373     3.02877
## 3   1.052027 0.1134526   1.621887   7.247475  8.539377    2.401373     3.02877
## 4   1.052027 0.1134526   1.621887   7.247475  8.539377    2.401373     3.02877
## 5   1.052027 0.1134526   1.621887   7.247475  8.539377    2.401373     3.02877
## 6   1.052027 0.1134526   1.621887   7.247475  8.539377    2.401373     3.02877
##   com_Sodexo saf_Aramark saf_Compass saf_Sodexo sus_Aramark sus_Compass
## 1  0.9122038    3.167799    5.648939   3.304156    3.834066    7.316548
## 2  0.9122038    3.167799    5.648939   3.304156    3.834066    7.316548
## 3  0.9122038    3.167799    5.648939   3.304156    3.834066    7.316548
## 4  0.9122038    3.167799    5.648939   3.304156    3.834066    7.316548
## 5  0.9122038    3.167799    5.648939   3.304156    3.834066    7.316548
## 6  0.9122038    3.167799    5.648939   3.304156    3.834066    7.316548
##   sus_Sodexo oss_Aramark oss_Compass oss_Sodexo
## 1   14.63913    3.426682  0.03233892 0.03054534
## 2   14.63913    3.426682  0.03233892 0.03054534
## 3   14.63913    3.426682  0.03233892 0.03054534
## 4   14.63913    3.426682  0.03233892 0.03054534
## 5   14.63913    3.426682  0.03233892 0.03054534
## 6   14.63913    3.426682  0.03233892 0.03054534

Predictions in Simulated Data

simulated_data_predictions <- cbind(simulated_data,as.data.frame(ens_new_prob_prediction(simulated_data)))

My PDP Graphs

  • Old Provider
plot_data <- simulated_data_predictions %>% group_by(Old.Provider)%>% summarise_at(vars(Other:Sodexo), mean, na.rm = TRUE) %>% gather(`New Provider`, Probability, Other :Sodexo)%>% mutate(Probability = round( Probability*100, digits = 2)) %>% ggplot(  aes(fill=`New Provider`, y=Probability, x=Old.Provider,  label=Probability))+ ylim(0,100)+    geom_text(position = position_dodge(width = .9),    # move to center of bars
              vjust = -0.5,    # nudge above top of bar
              size = 2.5) +
    geom_bar(position="dodge", stat="identity")+scale_fill_brewer(palette="Blues")+  
   #theme(        panel.grid.major = element_blank(),        panel.grid.minor = element_blank(),        panel.background = element_rect(fill = "transparent",colour = NA),
        #plot.background = element_rect(fill = "transparent",colour = NA)        )
  labs(        x ="Old Provider", y = "Predicted Probability (%)")+
  ggtitle("PDP Plot for Predicted Win Probabilities by Old Provider") +
  theme(plot.title = element_text(hjust = 0.5))
plot_data

  • Opportunity Global Region
plot_data <- simulated_data_predictions %>% group_by(Opportunity.Global.Region)%>% summarise_at(vars(Other:Sodexo), mean, na.rm = TRUE) %>% gather(`New Provider`, Probability, Other :Sodexo)%>% mutate(Probability = round( Probability*100, digits = 2)) %>% ggplot(  aes(fill=`New Provider`, y=Probability, x=Opportunity.Global.Region,  label = Probability))+ ylim(0,100)+    geom_text(position = position_dodge(width = .9),    # move to center of bars
              vjust = -0.5,    # nudge above top of bar
              size = 2.5) +
    geom_bar(position="dodge", stat="identity")+scale_fill_brewer(palette="Blues")+  
  labs(        x ="Opportunity Global Region", y = "Predicted Probability (%)")+
  ggtitle("PDP Plot for Predicted Win Probabilities by Opportunity Global Region") +
  theme(plot.title = element_text(hjust = 0.5))
plot_data

  • Service Level 1
plot_data <- simulated_data_predictions %>% group_by(Service.Level.1)%>% summarise_at(vars(Other:Sodexo), mean, na.rm = TRUE) %>% gather(`New Provider`, Probability, Other :Sodexo)%>% mutate(Probability = round( Probability*100, digits = 2)) %>% ggplot(  aes(fill=`New Provider`, y=Probability, x=Service.Level.1,  label = Probability))+ ylim(0,100)+    geom_text(position = position_dodge(width = .9),    # move to center of bars
              vjust = -0.5,    # nudge above top of bar
              size = 2.5) +
    geom_bar(position="dodge", stat="identity")+scale_fill_brewer(palette="Blues")+  
  labs(        x ="Service Level 1", y = "Predicted Probability (%)")+
  ggtitle("PDP Plot for Predicted Win Probabilities by Service Level 1") +
  theme(plot.title = element_text(hjust = 0.5))
plot_data

  • Strategic Segment L1
plot_data <- simulated_data_predictions %>% group_by(Strategic.Segment.L1 )%>% summarise_at(vars(Other:Sodexo), mean, na.rm = TRUE) %>% gather(`New Provider`, Probability, Other :Sodexo)%>% mutate(Probability = round( Probability*100, digits = 2)) %>% ggplot(  aes(fill=`New Provider`, y=Probability, x=Strategic.Segment.L1 ,  label = Probability))+ ylim(0,100)+    geom_text(position = position_dodge(width = .9),    # move to center of bars
              vjust = -0.5,    # nudge above top of bar
              size = 2.5) +
    geom_bar(position="dodge", stat="identity")+scale_fill_brewer(palette="Blues")+  
  labs(        x ="Strategic Segment L1", y = "Predicted Probability (%)")+
  ggtitle("PDP Plot for Predicted Win Probabilities by Strategic Segment L1") +
  theme(plot.title = element_text(hjust = 0.5))
plot_data

  • Regional Strategic Account
plot_data <- simulated_data_predictions %>% group_by(Regional.Strategic.Account )%>% summarise_at(vars(Other:Sodexo), mean, na.rm = TRUE) %>% gather(`New Provider`, Probability, Other :Sodexo)%>% mutate(Probability = round( Probability*100, digits = 2)) %>% ggplot(  aes(fill=`New Provider`, y=Probability, x=Regional.Strategic.Account ,  label = Probability))+ ylim(0,100)+    geom_text(position = position_dodge(width = .9),    # move to center of bars
              vjust = -0.5,    # nudge above top of bar
              size = 2.5) +
    geom_bar(position="dodge", stat="identity")+scale_fill_brewer(palette="Blues")+  
  labs(        x ="Regional Strategic Account", y = "Predicted Probability (%)")+
  ggtitle("PDP Plot for Predicted Win Probabilities by Regional Strategic Account") +
  theme(plot.title = element_text(hjust = 0.5))
  
knitr::opts_chunk$set(fig.width=unit(18,"cm"), fig.height=unit(11,"cm")) 
plot_data

PDP for Continuous Variables

## Function to create PDP's
pdp_singl_continuous <- function(name){
   pars <- as.list(match.call()[-1])
  x_s <- simulated_data[,as.character(pars$name)]   # grid where we want partial dependencies
  #print(names(x_s))
  a <- as.character(pars$name)
  x_c <-  dplyr::select(simulated_data, - a)  # other predictors
  #print(names(x_c))
  x_s <-  seq(min( fdata[,as.character(pars$name)]), max(fdata[,as.character(pars$name)]), length.out = 100)
  
  simulated_data_new <- merge(x_s , x_c) 
   colnames(simulated_data_new)[1] <- as.character(pars$name)
  simulated_data_new_predictions <- cbind(simulated_data_new,as.data.frame(ens_new_prob_prediction(simulated_data_new)))
    #print(substitute(name))
  pd <- simulated_data_new_predictions %>%
  group_by(!!var) %>%
  summarise(across(c("Other", "Aramark","Compass","Sodexo" ), mean, na.rm = TRUE))%>% gather(`New Provider`, Probability, Other :Sodexo)%>% mutate(Probability = round( Probability*100, digits = 2))

 return(pd)
}
  • Revenue in Million Euros
var <- quo(Revenue.Mill.Euros)
pdp_data <-pdp_singl_continuous("Revenue.Mill.Euros")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~Revenue.Mill.Euros, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities - Revenue in Million Euros")

fig
  • Margin
var <- quo(Margin)
pdp_data <-pdp_singl_continuous("Margin")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~Margin, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities - Margin")

fig
  • Sodexo
    • Initial Sales and Bidding
var <- quo(isb_Sodexo)
pdp_data <-pdp_singl_continuous("isb_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~isb_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Initial Sales and Bidding - Sodexo")

fig
  • Product and Service Quality
var <- quo(psq_Sodexo)
pdp_data <-pdp_singl_continuous("psq_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~psq_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Product and Service Quality - Sodexo")

fig
  • Pricing and Billing
var <- quo(pb_Sodexo)
pdp_data <-pdp_singl_continuous("pb_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~pb_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Pricing and Billing - Sodexo")

fig
  • Pricing and Billing
var <- quo(pb_Sodexo)
pdp_data <-pdp_singl_continuous("pb_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~pb_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Pricing and Billing - Sodexo")

fig
  • Project Management
var <- quo(pm_Sodexo)
pdp_data <-pdp_singl_continuous("pm_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~pm_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Project Management - Sodexo")

fig
  • Communication
var <- quo(com_Sodexo)
pdp_data <-pdp_singl_continuous("com_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~com_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Communication - Sodexo")

fig
  • Safety
var <- quo(saf_Sodexo)
pdp_data <-pdp_singl_continuous("saf_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~saf_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Safety - Sodexo")

fig
  • Sustainability
var <- quo(sus_Sodexo)
pdp_data <-pdp_singl_continuous("sus_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~sus_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Sustainability - Sodexo")

fig
  • Ongoing Service Support
var <- quo(oss_Sodexo)
pdp_data <-pdp_singl_continuous("oss_Sodexo")
library(plotly) 

fig <- plot_ly(pdp_data, x = ~oss_Sodexo, y = ~Probability, color = ~`New Provider`) 
fig <- fig %>% add_lines()%>% layout(title="PDP Plot for Predicted Win Probabilities: Ongoing Service Support - Sodexo")

fig